/*==============================================================================
EU - IC - Educational attainment  

Outline: 
I. Import education data from Excel files for the following countries: 
	BE, DE, DK, FI, IT, NL, SE

II. The IT and DE universes include all individuals. Merge in current student 
enrollment in order to modify these universes by subtracting those currently in 
school and those who are less than 6 years of age. 

A Technical Appendix to accompany this .do file is available from the authors 
upon request 

==============================================================================*/      

clear
set more off
*===============================================================================
* I. Import education data from the Excel workbooks, one country at a time.
*===============================================================================

**********************************  Belgium   **********************************
* Reads BE_ED_1970.xls, builds the edatt_* counts and stacks the country,
* NUTS 1 and NUTS 2 aggregates into the BE_ED tempfile.
clear
cd "$insheet_files/Belgium"
import excel using "BE_ED_1970.xls",  first

* The workbook reports primary together with unknown; separate the two.
generate primary = primary_and_unknown - unknown 

* Redistribute "unknown" across the categories listed in `educ', in proportion
* to each category's share of their combined total.
local educ primary general_16_less technical_16_less general_17_plus ///
	technical_17_plus higher_technical arts_education teachers_college 
egen subtotal = rsum(`educ')
foreach cat of varlist `educ' {
	generate share_`cat' = `cat'/subtotal
	replace `cat' = `cat' + share_`cat'*unknown
}

* Three broad attainment levels plus the total.
egen edatt_primary = rsum(primary general_16_less technical_16_less)
egen edatt_secondary = rsum(general_17_plus technical_17_plus higher_technical ///
						arts_education teachers_college)
generate edatt_university = university 

generate edatt_total = total 
 
* Years of school: create every edatt_yrs_0 ... edatt_yrs_16 as missing, then
* fill in the years that have a matching category.
forvalues yr = 0/16 {
	generate edatt_yrs_`yr' = .
}
 
replace edatt_yrs_6  = primary
replace edatt_yrs_9  = general_16_less + technical_16_less
replace edatt_yrs_12 = general_17_plus + technical_17_plus
replace edatt_yrs_14 = higher_technical + arts_education + teachers_college
replace edatt_yrs_16 = university

keep nuts edatt* 

* --- NUTS 2: first four characters of the code ---
generate nuts2 = substr(nuts,1,4)
collapse (sum) edatt*, by(nuts2)
rename nuts2 nuts

tempfile be_nuts2
save `be_nuts2.dta'

* --- NUTS 1: first three characters of the code ---
generate nuts1 = substr(nuts,1,3)
collapse (sum) edatt*, by(nuts1)
rename nuts1 nuts

tempfile be_nuts1
save `be_nuts1.dta'

* --- Country: first two characters of the code ---
generate country = substr(nuts,1,2)
collapse (sum) edatt* , by (country)
rename country nuts 

* Stack the country row on top of the finer levels saved above.
append using `be_nuts1.dta'
append using `be_nuts2.dta'

sort nuts
tempfile BE_ED
save `BE_ED.dta' 

********************************   Denmark    **********************************
* Reads DK_ED_1981.xls, builds the edatt_* counts and stacks the country,
* NUTS 1, NUTS 2 and NUTS 3 aggregates into the DK_ED tempfile.
clear
cd "$insheet_files/Denmark"

* Educational attainment 
import excel using "DK_ED_1981.xls", first
drop if nuts==""

* Three broad attainment levels.
generate edatt_primary    = uncompleted + basic_upper_2nd
generate edatt_secondary  = voc_upper_2nd_2_3 + higher_short
generate edatt_university = higher_medium + higher_long

* Total net of the two "ongoing" columns.
generate edatt_total = total - ongoing_voc - ongoing_gen 

* Years of school: create edatt_yrs_0 ... edatt_yrs_19 as missing, then fill in
* the years that have a matching category.
forvalues yr = 0/19 {
	generate edatt_yrs_`yr' = .
}

replace edatt_yrs_7  = uncompleted
replace edatt_yrs_10 = basic_upper_2nd 
replace edatt_yrs_12 = voc_upper_2nd_2_3
replace edatt_yrs_14 = higher_short
replace edatt_yrs_16 = higher_medium
replace edatt_yrs_19 = higher_long

* --- NUTS 3: the codes as imported ---
collapse (sum) edatt* , by(nuts) //these are NUTS 3

tempfile dk_nuts3
save `dk_nuts3.dta'

* --- NUTS 2: first four characters of the code ---
generate nuts2= substr(nuts,1,4) 
collapse (sum) edatt* , by(nuts2)
rename nuts2 nuts

tempfile dk_nuts2
save `dk_nuts2.dta'

* --- NUTS 1: first three characters of the code ---
generate nuts1 = substr(nuts,1,3)
collapse (sum) edatt* , by(nuts1)
rename nuts1 nuts

tempfile dk_nuts1
save `dk_nuts1.dta'

* --- Country: first two characters of the code ---
generate country = substr(nuts,1,2)
collapse (sum) edatt* , by(country)
rename country nuts

* Stack the country row on top of the finer levels saved above.
append using `dk_nuts1.dta'
append using `dk_nuts2.dta'
append using `dk_nuts3.dta'

tempfile DK_ED
save `DK_ED.dta'

********************************   Finland    **********************************
* Reads the "processed" sheet of FI_ED_1970.xls and saves the edatt_* counts
* to the FI_ED tempfile. No aggregation across NUTS levels is done here.
clear
cd "$insheet_files/Finland"
import excel using "FI_ED_1970.xls", sheet("processed") first

drop if nuts==""

* edatt_primary comes straight from the sheet; build the other two levels.
egen edatt_secondary  = rsum(edatt_secondary_only edatt_tertiary_1)
* Range varlist: every column from edatt_tertiary_2 through edatt_doctorate.
egen edatt_university = rsum(edatt_tertiary_2-edatt_doctorate)

generate edatt_total = pop_15_plus

* Years of school: create edatt_yrs_0 ... edatt_yrs_21 as missing, then fill in
* the years that have a matching category.
forvalues yr = 0/21 {
	generate edatt_yrs_`yr' = .
}

replace edatt_yrs_6  = edatt_primary
replace edatt_yrs_12 = edatt_secondary_only
replace edatt_yrs_15 = edatt_tertiary_1
replace edatt_yrs_16 = edatt_tertiary_2 
replace edatt_yrs_20 = edatt_tertiary_3
replace edatt_yrs_21 = edatt_doctorate

keep nuts edatt_primary edatt_secondary edatt_university edatt_total edatt_yrs*
tempfile FI_ED
save `FI_ED.dta'

************************************   Sweden    *****************************************
* Reads SE_ED_1970.xls, builds the edatt_* counts and stacks the country,
* NUTS 1, NUTS 2 and NUTS 3 aggregates into the SE_ED tempfile.
*Note: universe is 16-59 years of age 
* (The line above was previously written as "note: ...", which Stata runs as the
* -notes- command on the data in memory rather than treating it as a comment.)
clear
cd "$insheet_files/Sweden"
import excel using "SE_ED_1970.xls", sheet("Table Transpose") cellrange(A4:Q52) first 

drop if nuts==""

* Drop the women-only rows (original remark: "leaves both sexes").
* NOTE(review): confirm the sheet has no separate men-only rows; if it does,
* the collapse by nuts below would add them to the both-sexes rows.
drop if sex=="Women" //leaves both sexes

* Three broad attainment levels.
* NOTE(review): "primary" here is presumably resolved by variable abbreviation
* to primary_elementary_9_10 (the name used in the years-of-school step below)
* -- confirm against the sheet's column names.
egen edatt_primary = rsum(elementary* primary secondary_unfinished ///
	secondary_vocational_2)
egen edatt_secondary = rsum(matriculated_secondary trade_school general_school ///
	secondary_vocational_2_plus tertiary_2_or_less)
gen edatt_university = tertiary_2_plus

gen edatt_total=total 

* Years of school: create edatt_yrs_0 ... edatt_yrs_16 as missing, then fill in
* the years that have a matching category.
forval n = 0/16 {
	gen edatt_yrs_`n' = .
}

replace edatt_yrs_7 = elementary_7_less
replace edatt_yrs_8 = elementary_8_yrs
replace edatt_yrs_10 = primary_elementary_9_10 + secondary_vocational_2
replace edatt_yrs_9 = secondary_unfinished
replace edatt_yrs_13 = matriculated_secondary
replace edatt_yrs_11 = trade_school
replace edatt_yrs_12 = general_school + secondary_vocational_2_plus
replace edatt_yrs_14 = tertiary_2_or_less
replace edatt_yrs_16 = tertiary_2_plus

keep nuts edatt_primary edatt_secondary edatt_university edatt_total edatt_yrs*

* --- NUTS 3: the codes as imported ---
collapse (sum) edatt* , by(nuts)

tempfile se_nuts3
save `se_nuts3.dta'

* --- NUTS 2: first four characters of the code ---
gen nuts2= substr(nuts,1,4) 

collapse (sum) edatt*, by(nuts2)

rename nuts2 nuts

tempfile se_nuts2
save `se_nuts2.dta'

* --- NUTS 1: first three characters of the code ---
gen nuts1 = substr(nuts,1,3)

collapse (sum) edatt*, by(nuts1)

rename nuts1 nuts

tempfile se_nuts1
save `se_nuts1.dta'

* --- Country: first two characters of the code ---
gen country = substr(nuts,1,2)

collapse (sum) edatt*, by(country)

rename country nuts

* Stack the country row on top of the finer levels saved above.
append using `se_nuts1.dta'
append using `se_nuts2.dta'
append using `se_nuts3.dta'

tempfile SE_ED
save `SE_ED.dta'

******************************   Netherlands    ********************************
* Reads the "processed" sheet of NL_ED_1971.xls, builds the edatt_* counts and
* stacks the country, NUTS 1 and NUTS 2 aggregates into the NL_ED tempfile.
clear
cd "$insheet_files/Netherlands"
*Note: population is ages 14+ not currently in school 
 
import excel using "NL_ED_1971.xls", cellrange(A2:L14) first sheet("processed")

drop if nuts==""

* Redistribute unknown education proportionally across the population of those 
* not currently in school
egen subtotal = rsum(basis_niveau - higher) 
foreach var of varlist basis_niveau - higher {
	gen `var'_share= `var' / subtotal
	replace `var' = `var' + `var'_share*unknown
	* Drop the helper immediately: if it stayed in memory, the wildcards
	* advanced_lower* and secondary* used below would also match the *_share
	* variables and add the share fractions into the level totals.
	drop `var'_share
}

* Three broad attainment levels.
egen edatt_primary = rsum(basis_niveau lower_level advanced_lower*)
egen edatt_secondary = rsum(secondary* semi_higher)
gen edatt_university = higher

rename edatt_pop edatt_total

* Years of school: create edatt_yrs_0 ... edatt_yrs_19 as missing, then fill in
* the years that have a matching category.
forval n = 0/19 {
	gen edatt_yrs_`n' = .
}

replace edatt_yrs_6 = basis_niveau
replace edatt_yrs_8 = lower_level
replace edatt_yrs_10 = advanced_lower_vocational
replace edatt_yrs_12 = advanced_lower_general
replace edatt_yrs_13 = secondary_general
replace edatt_yrs_15 = secondary_vocational
replace edatt_yrs_17 = semi_higher
replace edatt_yrs_19 = higher

keep nuts edatt* 

* --- NUTS 2: the codes as imported ---
tempfile nl_nuts2
save `nl_nuts2.dta'

* --- NUTS 1: first three characters of the code ---
gen nuts1 = substr(nuts,1,3)

collapse (sum) edatt*, by(nuts1)

rename nuts1 nuts

tempfile nl_nuts1
save `nl_nuts1.dta'

* --- Country: first two characters of the code ---
gen country = substr(nuts,1,2)

collapse (sum) edatt*, by(country)

rename country nuts

* Stack the country row on top of the finer levels saved above.
append using `nl_nuts1.dta'
append using `nl_nuts2.dta'

tempfile NL_ED
save `NL_ED.dta'

*********************************  Italy    ************************************
* Reads the "processed" sheet of IT_ED_1971.xls, builds the edatt_* counts and
* stacks the country, NUTS 1 and NUTS 2 aggregates into the IT_ED tempfile.
clear
cd "$insheet_files/Italy"
import excel using "IT_ED_1971.xls", cellrange(A1:H23) first sheet("processed")

drop if nuts==""

* Primary level = primary plus middle school; the other edatt_* levels used
* below are taken from the sheet as imported.
egen edatt_primary = rsum(primary middle_school)

* Years of school: create edatt_yrs_0 ... edatt_yrs_16 as missing, then fill in
* the years that have a matching category.
forvalues yr = 0/16 {
	generate edatt_yrs_`yr' = .
}

replace edatt_yrs_0  = edatt_less_than_primary
replace edatt_yrs_5  = primary
replace edatt_yrs_8  = middle_school
replace edatt_yrs_13 = edatt_secondary
replace edatt_yrs_16 = edatt_university

keep nuts edatt* 

* --- NUTS 2: the codes as imported ---
tempfile it_nuts2
save `it_nuts2.dta'

* --- NUTS 1: first three characters of the code ---
generate nuts1 = substr(nuts,1,3)
collapse (sum) edatt*, by(nuts1)
rename nuts1 nuts

tempfile it_nuts1
save `it_nuts1.dta'

* --- Country: first two characters of the code ---
generate country = substr(nuts,1,2)
collapse (sum) edatt*, by(country)
rename country nuts

* Stack the country row on top of the finer levels saved above.
append using `it_nuts1.dta'
append using `it_nuts2.dta'

* Composite NUTS regions have to be built here so that the codes match the
* Yearbooks data used for the universe correction performed further below.
replace nuts ="ITH1&ITH2" if nuts=="ITH1"|nuts=="ITH2"
collapse (sum) edatt*, by(nuts)

tempfile IT_ED
save `IT_ED.dta'

************************************   Germany  ********************************
* Reads DE_ED_1970.xls, maps the source region codes onto NUTS 2 codes, builds
* the edatt_* counts plus an in-school count, and stacks the country, NUTS 1
* and NUTS 2 aggregates into the DE_ED tempfile.
clear
cd "$insheet_files/Germany"
import excel using "DE_ED_1970.xls", cellrange(A3:CD34) first

keep if imw=="i" //keep total category

* One row per (table line, region): the r<code> columns become rows, with the
* code suffix stored as the string variable region.
reshape long r, i(LfdNr Geg) j(region) string

* Create (in some cases rough) correspondence to modern regions
generate nuts = ""
replace nuts = "DE11" if inlist(region,"803","805","806")
replace nuts = "DE12" if inlist(region,"801","802","804","808")
replace nuts = "DE13" if inlist(region,"807","811","812","813")
replace nuts = "DE14" if inlist(region,"809","810","814")
replace nuts = "DE21" if inlist(region,"909","914","916","917")
replace nuts = "DE22" if inlist(region,"910","911")
replace nuts = "DE23" if inlist(region,"907","908")
replace nuts = "DE24" if region=="904"
replace nuts = "DE25" if inlist(region,"905","906")
replace nuts = "DE26" if inlist(region,"901","902","903")
replace nuts = "DE27" if inlist(region,"912","913","915")
replace nuts = "DE30" if region=="001"
replace nuts = "DE50" if region=="202"
replace nuts = "DE60" if region=="201"

replace nuts = "DE71&DE72" if inlist(region,"504","505","506")
replace nuts = "DE73" if inlist(region,"501","502","503")

replace nuts = "DE91" if inlist(region,"310","311")
replace nuts = "DE92" if inlist(region,"305","308","309")
replace nuts = "DE93" if inlist(region,"303","306")
replace nuts = "DE94" if inlist(region,"301","302","304","307")

replace nuts = "DEA1" if inlist(region,"403","407","408")
replace nuts = "DEA2" if inlist(region,"411","412")
replace nuts = "DEA3" if inlist(region,"401","404")
replace nuts = "DEA4" if inlist(region,"402","406")
replace nuts = "DEA5" if inlist(region,"405","409","410","413")

replace nuts = "DEB1" if inlist(region,"601","602","605")
replace nuts = "DEB2" if inlist(region,"603","604")
replace nuts = "DEB3" if inlist(region,"606","607","608","609")
replace nuts = "DEC0" if region=="701"
replace nuts = "DEF0" if inlist(region,"101","102","103","104","105")

* Add up the source regions that share a NUTS code, one row per table line.
collapse (sum) r, by(nuts GegenstandderNachweisungi LfdNr)	

destring LfdNr, replace force
drop Geg

* Back to one row per NUTS code: columns r<LfdNr> hold the table lines.
reshape wide r, i(nuts) j(LfdNr)	
		
* Attainment levels and total, built from the numbered table lines.
generate edatt_less_than_primary= r1 - r4
generate edatt_primary = r7		
egen edatt_secondary = rsum(r10 r13 r16 r19)
generate edatt_university = r22	

generate edatt_total = r1

* Years of school: create edatt_yrs_0 ... edatt_yrs_17 as missing, then fill in
* the years that have a matching table line.
forvalues yr = 0/17 {
	generate edatt_yrs_`yr' = .
}

replace edatt_yrs_4  = r1 - r4
replace edatt_yrs_8  = r7 
replace edatt_yrs_10 = r10
replace edatt_yrs_13 = r13
replace edatt_yrs_14 = r16 + r19
replace edatt_yrs_17 = r22

generate inschool=r25	
		
keep nuts edatt* inschool

* --- NUTS 2: the codes assigned above ---
tempfile de_nuts2
save `de_nuts2.dta'

* --- NUTS 1: first three characters of the code ---
generate nuts1 = substr(nuts,1,3)
collapse (sum) inschool edatt*, by(nuts1)
rename nuts1 nuts

tempfile de_nuts1
save `de_nuts1.dta'

* --- Country: first two characters of the code ---
generate country = substr(nuts,1,2)
collapse (sum) inschool edatt*, by(country)
rename country nuts
replace nuts = "DEF&DE6&DE5&DE9&DEA&DEB&DEC&DE1&DE2&DE3&DE7" if nuts=="DE" //West Germany

* Stack the country row on top of the finer levels saved above.
append using `de_nuts1.dta'
append using `de_nuts2.dta'

* Numbers are in thousands. Multiply by 1000.
foreach v of varlist inschool edatt* {
	replace `v' = `v' *1000
}

tempfile DE_ED
save `DE_ED.dta'		
	
*===============================================================================
* Combine all countries 
*===============================================================================

* Start from Belgium and stack the other six country files underneath it.
use `BE_ED.dta', clear
append using `DK_ED.dta'
append using `FI_ED.dta'
append using `SE_ED.dta'
append using `NL_ED.dta'
append using `IT_ED.dta'
append using `DE_ED.dta'

* Keep the stacked file for the merge in section II.
tempfile edatt_all
save `edatt_all.dta'

*===============================================================================
* II. Merge in current student enrollment & modify universes to subtract  
* those currently in school.
*===============================================================================

* Age data for DE and IT only, with student enrollment attached by nuts code.
use "$dta_files/IC_EU_AGE.dta" , clear
gen country = substr(nuts,1,2)
keep if country=="DE"|country=="IT"
merge 1:1 nuts using "$insheet_files/student_enrollment", gen(_merge_age) keep(match master) 

* Recode the composite codes found in these files to the NUTS 2 codes used in
* the education data, then add up the rows that now share a code.
replace nuts="DE11" if nuts=="DE145&DE112&DE113&DE114&DE11C&DE117&DE118&DE119&DE115&DE11D&DE116&DE11A&DE111&DE144"
replace nuts="DE12" if nuts=="DE12B&DE122&DE123&DE11B&DE127&DE125&DE126&DE129"
replace nuts="DE13" if nuts=="DE147&DE132&DE138&DE133&DE131&DE139&DE134&DE136&DE13A&DE121&DE124"
replace nuts="DE14" if nuts=="DE146&DE148&DE149&DE141&DE143&DE142&DE137&DE135&DE12C&DE12A"
replace nuts="DE73" if nuts=="DE724&DE73"
replace nuts="DE71&DE72" if nuts=="DE71&DE721&DE722&DE723&DE725"
replace nuts="DEB1" if nuts=="DEB1&DEB39&DEB3B&DEB35&DEB3J"
replace nuts="DEB3" if  nuts=="DEB31&DEB32&DEB33&DEB34&DEB36&DEB37&DEB38&DEB3A&DEB3C&DEB3D&DEB3E&DEB3F&DEB3G&DEB3H&DEB3I&DEB3K"
replace nuts="DE91" if nuts=="DE911&DE912&DE91B&DE917&DE91A&DE916"|nuts=="DE911&DE912&DE91B&DE917"|nuts=="DE91A&DE925&DE926&DE918&DE916&DE919&DE915"
replace nuts="DE92" if nuts=="DE922&DE923&DE927&DE928&DE929"|nuts=="DE925&DE926&DE918&DE919&DE915"
replace nuts="DE93" if nuts=="DE931&DE93A&DE934&DE935&DE933&DE938&DE914&DE913"|nuts=="DE932&DE939&DE937&DE93B&DE936"
replace nuts="DE94" if nuts=="DE944&DE94E&DE949&DE94B"|nuts=="DE94C&DE947&DE942&DE94H"|nuts=="DE94A&DE945&DE94G&DE946&DE943&DE94D&DE941&DE948&DE94F"
replace nuts="DEA2" if nuts=="DEA22&DEA23&DEA24&DEA27&DEA2A&DEA2B&DEA2C"|nuts=="DEA2D&DEA29&DEA26&DEA28"
collapse (sum) preschool primary secondary tertiary age*, by(nuts)

* Attach the education counts for all countries (saved in section I).
merge 1:1 nuts using `edatt_all.dta', gen(_merge_inschool)

gen country=substr(nuts,1,2) 

* Below, estimate those currently in school when this information is missing
* by assigning the population adjusted in school values from the NUTS 1 region. 

gen nuts1 = substr(nuts,1,3)
foreach var of varlist preschool-tertiary age_total {
	* Value on the NUTS 1 row itself (missing on every other row) ...
	gen nuts1_`var' = `var' if nuts==nuts1
	* ... copied to every row that belongs to the same NUTS 1 region.
	bys nuts1: egen nuts1_`var'_min = min(nuts1_`var')
	drop nuts1_`var' 
	* Rename the _min variable by its full name. The earlier form renamed
	* nuts1_`var' to itself and only worked because Stata resolved that
	* name as an abbreviation of nuts1_`var'_min.
	ren nuts1_`var'_min nuts1_`var' 
}


* Where an enrollment count is missing or zero, scale the NUTS 1 count by the
* region's share of the NUTS 1 total population.
foreach var of varlist preschool-tertiary {
	replace `var' =  nuts1_`var'*age_total/ nuts1_age_total if (`var'==.|`var'==0)

}
drop nuts1*

* subtracting out those ages 0-6 and those currently in school.

*DE
replace edatt_yrs_4 = edatt_yrs_4 - primary - age_0_4 - age_5_9*(2/5) if country== "DE" & primary !=. & secondary !=.
replace edatt_yrs_8 = edatt_yrs_8 - secondary if country=="DE" & primary !=. & secondary !=.
 
* Same missing-value guard as the neighbouring replaces (added for consistency;
* this variable is not kept in the saved file).
replace edatt_less_than_primary= edatt_less_than_primary - primary - age_0_4 - age_5_9*(2/5) if country== "DE" & primary !=. & secondary !=.
replace edatt_primary = edatt_primary - secondary  if country=="DE" & primary !=. & secondary !=.

*replace edatt_secondary = edatt_secondary - tertiary if country=="DE" 
replace edatt_total = edatt_total - primary - secondary - age_0_4 - age_5_9*(2/5) if country== "DE"  & primary !=. & secondary !=.

*IT
replace edatt_less_than_primary = edatt_less_than_primary - primary if country=="IT" & primary !=. & secondary !=.
replace edatt_primary = edatt_primary - secondary if country=="IT" & primary !=. & secondary !=.
replace edatt_yrs_0 = edatt_yrs_0 - primary if country=="IT" & primary !=. & secondary !=.
replace edatt_yrs_5 = edatt_yrs_5 - secondary*(3/8) if country=="IT" & primary !=. & secondary !=. //subtract out approximate enrollments in years 6,7,8 of school
replace edatt_yrs_8 = edatt_yrs_8 - secondary*(5/8) if country=="IT" & primary !=. & secondary !=. //subtract out approximate enrollments in years 9,10,11,12,13 of school
replace edatt_total = edatt_total - primary - secondary if country== "IT" & primary !=. & secondary !=.

* Only the years-of-school counts and the total go into the saved file.
keep nuts edatt_yrs* edatt_total

* Zeros are recoded to missing (the collapse (sum) steps turn all-missing
* columns into zeros).
foreach var of varlist edatt* {
	replace `var' = . if `var' ==0
}

sort nuts
save "$dta_files/IC_EU_education.dta", replace

